{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer = lambda words : words.split()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['This', 'is', 'a', 'test', 'for', 'tokenizer']"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer(\"This is a test for tokenizer\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torchtext.data import Field"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "Review = Field(sequential=True, tokenize=tokenizer, lower=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "Label = Field(sequential=False, use_vocab=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "SequenceField = Field(tokenize=tokenizer, init_token='<sos>', eos_token='<eos>', lower=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "SequenceField = Field(tokenize=tokenizer, init_token='<sos>', eos_token='<eos>', lower=True, fix_length=50)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "SequenceField = Field(tokenize=tokenizer, init_token='<sos>', eos_token='<eos>', unk_token='<unk>')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "SequenceField = Field(tokenize=tokenizer, init_token='<sos>', eos_token='<eos>', unk_token='<unk>', batch_first=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torchtext.data import TabularDataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_datafields = [(\"id\", None),\n",
    "                 (\"content\", Review), (\"Business\", Label),\n",
    "                 (\"SciTech\", Label), (\"Sports\", Label),\n",
    "                 (\"World\", Label)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_datafields = [(\"id\", None),\n",
    "                  (\"content\", Review)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train, valid = TabularDataset.splits(path='NewsClassification', \n",
    "                                    train='train.csv',\n",
    "                                    valid='valid.csv',\n",
    "                                    format='csv',\n",
    "                                    skip_header=True,\n",
    "                                    fields=train_datafields)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = TabularDataset(path=\"NewsClassification/test.csv\",\n",
    "                    format='csv',\n",
    "                    skip_header=True,\n",
    "                    fields=test_datafields)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Review.build_vocab(train, min_freq=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torchtext.data import Iterator, BucketIterator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "BATCH_SIZE = 128"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_iter, valid_iter, test_iter = BucketIterator.splits(\n",
    "                                     (train, valid, test),\n",
    "                                     batch_size=BATCH_SIZE,\n",
    "                                     device=device,\n",
    "                                     sort_key=lambda x: len(x.comment_text), \n",
    "                                     sort_within_batch=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "from torchtext import vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "vec = vocab.Vectors('glove.6B.50d.txt', './vec/glove_embedding/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Review.build_vocab(train, min_freq=2, vectors=vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch.nn as nn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "class LSTMClassifier(nn.Module):\n",
    "    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout):\n",
    "        self.embedding = nn.Embedding(len(Review.vocab), embedding_dim)\n",
    "        self.rnn = nn.LSTM(embedding_dim, hidden_dim)\n",
    "        self.fc = nn.Linear(hidden_dim, output_dim)\n",
    "        self.dropout = nn.Dropout(dropout)\n",
    "    \n",
    "    def forward(self, x):\n",
    "        x = self.embedding(x)\n",
    "        output, (hidden, cell) = self.rnn(x)\n",
    "        hidden = self.dropout(hidden)\n",
    "        return self.fc(hidden)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "EMBEDDING_DIM = 100\n",
    "HIDDEN_DIM = 256\n",
    "OUTPUT_DIM = 1\n",
    "DROPOUT = 0.5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class MultiLSTMClassifier(nn.Module):\n",
    "    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, num_layers):\n",
    "        self.embedding = nn.Embedding(len(Review.vocab), embedding_dim)\n",
    "        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers)\n",
    "        self.fc = nn.Linear(hidden_dim, output_dim)\n",
    "        self.dropout = nn.Dropout(dropout)\n",
    "    \n",
    "    def forward(self, x):\n",
    "        x = self.embedding(x)\n",
    "        output, (hidden, cell) = self.rnn(x)\n",
    "        hidden = self.dropout(hidden)\n",
    "        return self.fc(hidden[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "EMBEDDING_DIM = 100\n",
    "HIDDEN_DIM = 256\n",
    "OUTPUT = 1\n",
    "DROPOUT = 0.5\n",
    "NUM_LAYERS = 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = MultiLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, NUM_LAYERS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "class BiLSTMClassifier(nn.Module):\n",
    "    def __init__(self, embedding_dim, hidden_dim, output_dim, dropout, num_layers):\n",
    "        self.embedding = nn.Embedding(len(Review.vocab), embedding_dim)\n",
    "        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=True)\n",
    "        self.fc = nn.Linear(2*hidden_dim, output_dim)\n",
    "        self.dropout = nn.Dropout(dropout)\n",
    "    \n",
    "    def forward(self, x):\n",
    "        x = self.embedding(x)\n",
    "        output, (hidden, cell) = self.rnn(x)\n",
    "        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))\n",
    "        return self.fc(hidden.squeeze(0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = BiLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT, NUM_LAYERS)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
